{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fc3668d00f2a605f",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# Dataset Creation\n",
    "\n",
    "## QWK Dataset \n",
    "\n",
    "First, we create the QWK dataset, which uses 5-fold cross-validation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3985d774",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Point pip at the Tsinghua PyPI mirror. This is written to pip's configuration,\n",
    "# so it persists beyond this notebook session. `%pip` (rather than `!pip`) makes\n",
    "# sure the pip belonging to the running kernel's environment is the one invoked.\n",
    "%pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c6da7f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the notebook's dependencies into the kernel's own environment\n",
    "# (`%pip` targets the running kernel; `!pip` may hit a different interpreter).\n",
    "# NOTE: versions are unpinned; pin them if reproducible runs are required.\n",
    "%pip install pandas\n",
    "%pip install datasets\n",
    "%pip install openpyxl\n",
    "%pip install transformers\n",
    "%pip install torch torchvision torchaudio\n",
    "%pip install scikit-learn\n",
    "%pip install langchain\n",
    "%pip install langchain_community\n",
    "%pip install vllm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33246726de77ad56",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import warnings\n",
    "\n",
    "import pandas as pd\n",
    "from datasets import Dataset, DatasetDict\n",
    "\n",
    "N_FOLDS = 5\n",
    "SPLIT_NAMES = (\"train\", \"dev\", \"test\")\n",
    "SPLITS_DIR = Path(\"../data/splits\")\n",
    "# Essay 10534 is dropped from every split. The original note says its row\n",
    "# contained only the essay text; presumably it has no domain1_score and is\n",
    "# therefore absent from `df` after the null filter below (TODO confirm).\n",
    "EXCLUDED_ESSAY_IDS = {10534}\n",
    "\n",
    "\n",
    "def read_split_ids(ids_path):\n",
    "    \"\"\"Return the essay ids listed one per line in `ids_path`.\n",
    "\n",
    "    A missing file is created empty (so the split comes back empty), but a\n",
    "    warning is emitted so the gap does not go unnoticed. Blank lines are\n",
    "    skipped and ids in EXCLUDED_ESSAY_IDS are filtered out.\n",
    "    \"\"\"\n",
    "    if not ids_path.exists():\n",
    "        warnings.warn(f\"{ids_path} not found; creating an empty split file\")\n",
    "        ids_path.touch()\n",
    "    with ids_path.open(\"r\") as f:\n",
    "        ids = [int(line) for line in f if line.strip()]\n",
    "    return [essay_id for essay_id in ids if essay_id not in EXCLUDED_ESSAY_IDS]\n",
    "\n",
    "\n",
    "df = pd.read_excel(\"../data/training_set_rel3.xlsx\", index_col=\"essay_id\")\n",
    "# Keep only essays that have a domain1_score.\n",
    "df = df[df[\"domain1_score\"].notnull()]\n",
    "\n",
    "# Build one DatasetDict (train/dev/test) per fold from the id files.\n",
    "fold_datasets = []\n",
    "for fold_idx in range(N_FOLDS):\n",
    "    fold_dir = SPLITS_DIR / f\"fold_{fold_idx}\"\n",
    "    fold_dir.mkdir(parents=True, exist_ok=True)\n",
    "    split_datasets = {\n",
    "        split: Dataset.from_pandas(\n",
    "            df.loc[read_split_ids(fold_dir / f\"{split}_ids.txt\")], split=split\n",
    "        )\n",
    "        for split in SPLIT_NAMES\n",
    "    }\n",
    "    fold_datasets.append(DatasetDict(split_datasets))\n",
    "\n",
    "folds = DatasetDict(\n",
    "    {f\"fold_{fold_idx}\": fold for fold_idx, fold in enumerate(fold_datasets)}\n",
    ")\n",
    "\n",
    "# NOTE(review): inputs are read from \"../data\" but the result is written under\n",
    "# \"./data\"; confirm this output location is intended.\n",
    "folds.save_to_disk(\"./data/datasets/folds\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
